CAS Logo Open main paper 🔗

2  Estimating the five fairness premiums

Objectives

Building on the simulations in Chapter 1, this section estimates the spectrum of fairness. It complements Section 4.2 of the main paper, linked from the supplement’s introduction. We recommend keeping the main paper nearby, as this supplement focuses exclusively on implementation.

Packages for this section
library(tidyverse)
library(latex2exp)
library(jsonlite)

## also setup python for optimal transport. 
library(reticulate)
#' Read the path of the Python environment from a plain-text config file.
#'
#' The file may contain comment lines (starting with '#') and blank lines;
#' the first remaining line is taken as the environment path.
#'
#' @param file_path Path to the text file holding the Python env path.
#' @return The trimmed path (character scalar).
#' @note Fix: error messages now reference the actual `file_path` argument
#'   instead of the hardcoded default 'python_env_path.txt'.
get_python_env_path <- function(file_path = "python_env_path.txt") {
  if (!file.exists(file_path)) {
    stop("The file '", file_path,
         "' is missing. Please create it and specify your Python environment path.",
         call. = FALSE)
  }
  
  # Read the file and keep only non-comment, non-empty lines
  env_path <- readLines(file_path, warn = FALSE)
  env_path <- env_path[!grepl("^#", env_path) & nzchar(env_path)]
  
  if (length(env_path) == 0) {
    stop("No valid Python environment path found in '", file_path,
         "'. Please enter your path.", call. = FALSE)
  }
  
  # Only the first usable line matters; trim surrounding whitespace
  trimws(env_path[1])
}
## Configure reticulate to use the Python environment from the config file.
## The previous tryCatch({...}, error = function(e) stop(e$message)) wrapper
## only caught the error to immediately re-throw its message, which strips
## the condition class and obscures the original call — plain calls give
## the same stop-on-failure behavior with better diagnostics.
python_env_path <- get_python_env_path()
use_python(python_env_path, required = TRUE)
message("Python environment successfully set to: ", python_env_path)
Data for this section
## Load simulated train/validation/test scenarios (generated in Chapter 1)
sims <- fromJSON('simuls/train_scenarios.json')
valid <-  fromJSON('simuls/valid_scenarios.json')
test <- fromJSON('simuls/test_scenarios.json')

## For the experiment in section 5
sim_samples <- jsonlite::fromJSON('simuls/sim_study.json')
Functions and objects from past sections
## Factor levels and matching LaTeX labels (via latex2exp) for the five
## premium families: Best-estimate, Unaware, Aware, Hybrid, Corrective
levels_for_premiums <- c("mu_B", "mu_U", "mu_A", "mu_H", "mu_C")
labels_for_premiums <- c("$\\widehat{\\mu}^B$","$\\widehat{\\mu}^U$", "$\\widehat{\\mu}^A$", "$\\widehat{\\mu}^H$", "$\\widehat{\\mu}^C$")

## four ingredients for the 5 families of premiums
#' Build the five premium functions from four ingredients.
#'
#' @param best Best-estimate model: data.frame(X1, X2, D) -> numeric vector.
#' @param pdx Propensity model P(D = d | X): same calling convention as `best`.
#' @param maps_to_corr Transport map: data.frame(mu_B, D) -> corrected premium.
#' @param marg Length-2 numeric, marginal probabilities of D = 0 and D = 1.
#' @return Named list of closures: mu_B, mu_U, mu_A, mu_H, mu_C, each taking
#'   (x1, x2, d); d is ignored (default NULL) for the D-free premiums.
premium_generator <- function(best, pdx, maps_to_corr, marg){
  ## Evaluate `fun` at both levels of D and return an n x 2 matrix (one
  ## column per level). Built with lapply + cbind rather than sapply: sapply
  ## drops to a plain length-2 vector when length(x1) == 1, which used to
  ## break the row-wise sums below for single observations.
  eval_over_d <- function(fun, x1, x2){
    do.call(cbind, lapply(0:1, function(dl){
      fun(data.frame(X1 = x1,
                     X2 = x2,
                     D = rep(dl, length(x1))))
    }))
  }
  
  list("mu_B" = function(x1, x2, d){
    
    ## simple call of the best estimate model
    best(data.frame(X1 = x1,
                    X2 = x2,
                    D = d))
  }, "mu_U" = function(x1, x2, d = NULL){
    
    ## Explicit inference : mu_U = E(mu_B|X), weighting best-estimate
    ## premiums at each level of D by the predicted propensity
    tab_best <- eval_over_d(best, x1, x2)
    tab_pdx <- eval_over_d(pdx, x1, x2)
    
    rowSums(tab_best * tab_pdx)
  }, "mu_A" = function(x1, x2, d = NULL){
    
    ## mu_A = E(mu_B) unconditional: weight by the marginal P(D = d)
    tab_best <- eval_over_d(best, x1, x2)
    rowSums(sweep(tab_best, 2, marg, `*`))
  }, "mu_H" = function(x1, x2, d = NULL){
    
    ## explicit inference of mu_C : mu_H = E(mu_C|X)
    tab_corr <- do.call(cbind, lapply(0:1, function(dl){
      sb <- best(data.frame(X1 = x1,
                            X2 = x2,
                            D = rep(dl, length(x1))))
      maps_to_corr(data.frame(mu_B = sb, D = dl))
    }))
    tab_pdx <- eval_over_d(pdx, x1, x2)
    
    rowSums(tab_corr * tab_pdx)
  }, "mu_C" = function(x1, x2, d = NULL){
    
    ## mu_C = T^{d}(mu_B(x, d)) : transport the best estimate
    mu_b <- best(data.frame(X1 = x1,
                            X2 = x2,
                            D = d))
    maps_to_corr(data.frame(mu_B = mu_b, D = d))
  })
}

levels_for_quants <- c('proxy_vuln', 'risk_spread', 'fair_range', 'parity_cost')


#' Build the four fairness-gap quantities from a list of premium functions.
#'
#' @param premiums Named list of closures as returned by `premium_generator`
#'   (mu_B, mu_U, mu_A, mu_H, mu_C).
#' @return Named list of closures: proxy_vuln, risk_spread, fair_range,
#'   parity_cost, each returning a numeric vector of gaps per observation.
#' @note Row-wise `apply(data.frame, 1, max/min)` was replaced by the
#'   vectorized `pmax`/`pmin`/`abs`, which gives identical results without
#'   the matrix coercion and per-row overhead.
quant_generator <- function(premiums){
  list('proxy_vuln' = function(x1, x2, d){
    ## gap between unaware and aware premiums (proxy discrimination exposure)
    premiums$mu_U(x1 = x1, x2 = x2) - 
      premiums$mu_A(x1 = x1, x2 = x2)
  },
  'risk_spread' = function(x1, x2, x3, d){
    ## spread of the best-estimate premium across the two levels of D;
    ## for two values, max - min is just the absolute difference
    risk1 <- premiums$mu_B(x1 = x1, x2 = x2, d = 1)
    risk0 <- premiums$mu_B(x1 = x1, x2 = x2, d = 0)
    abs(risk1 - risk0)
  },
  'fair_range' = function(x1, x2, d){
    ## range spanned by the whole spectrum of fairness premiums
    vals <- list(premiums$mu_B(x1 = x1, x2 = x2, d = d),
                 premiums$mu_U(x1 = x1, x2 = x2, d = NULL),
                 premiums$mu_A(x1 = x1, x2 = x2, d = NULL),
                 premiums$mu_H(x1 = x1, x2 = x2, d = NULL),
                 premiums$mu_C(x1 = x1, x2 = x2, d = d))
    
    do.call(pmax, vals) - do.call(pmin, vals)
  },
  'parity_cost' = function(x1, x2, d){
    ## premium change induced by the corrective transport
    premiums$mu_C(x1 = x1, x2 = x2,
              d = d) -
      premiums$mu_B(x1 = x1, x2 = x2, 
                d = d)
  })
}

the_CAS_colors <- c('#FED35D', '#F89708', '#205ED5', '#142345')

In this section, we provide an example of detailed methodology for estimating the benchmark premiums from the spectrum of fairness defined in Section 4.1 of the main paper. It complements the estimation of Section 4.1.

In this paper, premiums are estimated using lightgbm algorithms from Ke et al. (2017), a package that supports common actuarial distributions (Tweedie, Poisson, Gamma, etc.) and was found to have excellent predictive performance compared to other boosting algorithms in Chevalier and Côté (2024). Given lightgbm’s flexibility, we advocate for careful variable preselection.

We optimize hyperparameters with a validation set to prevent overfitting. Key hyperparameters in lightgbm are the number of trees (num_iterations), regularization (lambda_l1, lambda_l2) for complexity control, and subsampling percentages (feature_fraction, bagging_fraction).

The data used to construct prices and the estimated spectrum should align to ensure comparability. In the Case Study of the main paper, we conduct a posteriori benchmarking, as the study period is historical. A posteriori assessment can reveal segments where the model misaligned from fairness pillars after the fact. In contrast, conducting benchmarking concurrently with the development of a new pricing model provides an estimate of how well the commercial price aligns with fairness pillars for future periods, though the true fairness implications of the commercial price will only become evident retrospectively.

Discretizing \(D\)

Subsequent steps involve numerous computations over all possible values of \(D\). When \(D\) is continuous, discretization can improve tractability. The discretized version should be used throughout this appendix, and the discretization function should be kept for future applications. For multivariate \(D\), one approach is to first discretize continuous components, then treat the vector as one categorical variable where each unique combination defines a category. Care should be taken to ensure a manageable number of categories, with sufficient representation in each.

Balancing benchmark premiums

Profit and expense loadings are important, but they must not reintroduce unfairness. To align premium benchmarks with expected profits, we multiply the premiums by a factor \(\delta_j \geq 1~\forall j \in \{B, U, A, H, C\}\) to globally balance all premiums to the level of the commercial price.

In what follows, we detail how we estimate each benchmark premium in our framework.

2.1 Best-estimate premium

The best estimate premium \(\widehat{\mu}^B(\mathbf{x}, d)\) serves as the anchor. Thus, its estimation is crucial for our framework. It provides the best predictor of the response variable \(Y\) when differentiating risks based on \((X, D)\). It can be derived from indicated rates or data-driven (what we do) estimates as detailed in Complement 5 of the main paper.

Training the data-driven best-estimate price
source('___lgb_best_estimate.R')

## Remove stale best-estimate predictions before refitting
unlink(file.path('preds', "*_best_estimate.json"))

## Hyperparameter grid for the search (a single combination here)
hyperparameter_grid <- expand.grid(
  learning_rate = c(0.01),
  feature_fraction = c(0.75),
  bagging_fraction = c(0.75),
  max_depth = c(5),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## Fit one best-estimate lightgbm per scenario, keyed by scenario name
best_lgb <- lapply(setNames(nm = names(sims)), function(scenario){
  the_best_estimate_lightgbm_fun(
    list_data = list('train' = sims[[scenario]],
                     'valid' = valid[[scenario]],
                     'test' = test[[scenario]]),
    name = scenario,
    hyperparameter_grid = hyperparameter_grid
  )
})
Best for scenario:  Scenario1  
hyperparam  1 / 1 
Best valid mse: 52.82546  
optimal ntree: 574  
Training time: 9.565609  sec. 
Best for scenario:  Scenario2  
hyperparam  1 / 1 
Best valid mse: 52.5841  
optimal ntree: 482  
Training time: 8.382996  sec. 
Best for scenario:  Scenario3  
hyperparam  1 / 1 
Best valid mse: 52.39612  
optimal ntree: 493  
Training time: 9.366509  sec. 
Training the best-estimate price on experimental data (for later use in section 5)
## Hyperparameter grid for the simulation-study fits (single combination)
hyperparameter_grid_sims <- expand.grid(
  learning_rate = c(0.01),
  bagging_fraction = c(0.75),
  max_depth = c(6),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## Fit one best-estimate model per simulated sample (used in section 5)
best_sims <- lapply(seq_along(sim_samples), function(i){
  sample_i <- sim_samples[[i]]
  the_best_estimate_lightgbm_fun(
    list_data = list('train' = sample_i$train,
                     'valid' = sample_i$valid,
                     'test' = sample_i$test),
    name = paste0('sim', i),
    hyperparameter_grid = hyperparameter_grid_sims
  )
})
Best for scenario:  sim1  
hyperparam  1 / 1 
Best valid mse: 60.70295  
optimal ntree: 307  
Training time: 8.505874  sec. 
Best for scenario:  sim2  
hyperparam  1 / 1 
Best valid mse: 52.532  
optimal ntree: 300  
Training time: 5.885618  sec. 
Best for scenario:  sim3  
hyperparam  1 / 1 
Best valid mse: 56.8653  
optimal ntree: 345  
Training time: 6.649762  sec. 
Best for scenario:  sim4  
hyperparam  1 / 1 
Best valid mse: 62.87865  
optimal ntree: 388  
Training time: 6.529138  sec. 
Best for scenario:  sim5  
hyperparam  1 / 1 
Best valid mse: 54.88348  
optimal ntree: 380  
Training time: 5.94952  sec. 
Best for scenario:  sim6  
hyperparam  1 / 1 
Best valid mse: 51.50142  
optimal ntree: 542  
Training time: 7.010613  sec. 
Best for scenario:  sim7  
hyperparam  1 / 1 
Best valid mse: 57.43703  
optimal ntree: 378  
Training time: 6.378983  sec. 
Best for scenario:  sim8  
hyperparam  1 / 1 
Best valid mse: 55.0068  
optimal ntree: 369  
Training time: 4.986214  sec. 
Best for scenario:  sim9  
hyperparam  1 / 1 
Best valid mse: 52.62273  
optimal ntree: 367  
Training time: 4.461421  sec. 
Best for scenario:  sim10  
hyperparam  1 / 1 
Best valid mse: 61.87469  
optimal ntree: 381  
Training time: 6.066156  sec. 
Best for scenario:  sim11  
hyperparam  1 / 1 
Best valid mse: 46.80371  
optimal ntree: 302  
Training time: 5.39969  sec. 
Best for scenario:  sim12  
hyperparam  1 / 1 
Best valid mse: 58.78569  
optimal ntree: 401  
Training time: 6.188616  sec. 
Best for scenario:  sim13  
hyperparam  1 / 1 
Best valid mse: 59.36031  
optimal ntree: 434  
Training time: 6.880865  sec. 
Best for scenario:  sim14  
hyperparam  1 / 1 
Best valid mse: 53.34128  
optimal ntree: 798  
Training time: 7.50502  sec. 
Best for scenario:  sim15  
hyperparam  1 / 1 
Best valid mse: 56.85542  
optimal ntree: 312  
Training time: 4.823647  sec. 
Best for scenario:  sim16  
hyperparam  1 / 1 
Best valid mse: 59.68007  
optimal ntree: 376  
Training time: 5.085136  sec. 
Best for scenario:  sim17  
hyperparam  1 / 1 
Best valid mse: 45.75173  
optimal ntree: 411  
Training time: 5.357519  sec. 
Best for scenario:  sim18  
hyperparam  1 / 1 
Best valid mse: 58.16137  
optimal ntree: 304  
Training time: 4.342801  sec. 
Best for scenario:  sim19  
hyperparam  1 / 1 
Best valid mse: 52.06294  
optimal ntree: 333  
Training time: 4.85933  sec. 
Best for scenario:  sim20  
hyperparam  1 / 1 
Best valid mse: 58.33666  
optimal ntree: 392  
Training time: 5.431267  sec. 
Best for scenario:  sim21  
hyperparam  1 / 1 
Best valid mse: 53.32397  
optimal ntree: 374  
Training time: 5.332651  sec. 
Best for scenario:  sim22  
hyperparam  1 / 1 
Best valid mse: 56.51362  
optimal ntree: 270  
Training time: 5.361021  sec. 
Best for scenario:  sim23  
hyperparam  1 / 1 
Best valid mse: 53.82801  
optimal ntree: 353  
Training time: 8.652734  sec. 
Best for scenario:  sim24  
hyperparam  1 / 1 
Best valid mse: 57.35889  
optimal ntree: 313  
Training time: 7.433861  sec. 
Best for scenario:  sim25  
hyperparam  1 / 1 
Best valid mse: 54.70978  
optimal ntree: 327  
Training time: 8.885597  sec. 
Best for scenario:  sim26  
hyperparam  1 / 1 
Best valid mse: 52.12559  
optimal ntree: 408  
Training time: 10.53373  sec. 
Best for scenario:  sim27  
hyperparam  1 / 1 
Best valid mse: 60.33907  
optimal ntree: 492  
Training time: 10.02974  sec. 
Best for scenario:  sim28  
hyperparam  1 / 1 
Best valid mse: 49.47171  
optimal ntree: 314  
Training time: 6.810398  sec. 
Best for scenario:  sim29  
hyperparam  1 / 1 
Best valid mse: 58.30981  
optimal ntree: 402  
Training time: 8.26865  sec. 
Best for scenario:  sim30  
hyperparam  1 / 1 
Best valid mse: 63.92257  
optimal ntree: 321  
Training time: 7.018725  sec. 
Best for scenario:  sim31  
hyperparam  1 / 1 
Best valid mse: 52.67251  
optimal ntree: 418  
Training time: 10.86446  sec. 
Best for scenario:  sim32  
hyperparam  1 / 1 
Best valid mse: 55.84136  
optimal ntree: 456  
Training time: 16.10204  sec. 
Best for scenario:  sim33  
hyperparam  1 / 1 
Best valid mse: 56.60131  
optimal ntree: 472  
Training time: 16.27423  sec. 
Best for scenario:  sim34  
hyperparam  1 / 1 
Best valid mse: 55.16327  
optimal ntree: 354  
Training time: 11.00037  sec. 
Best for scenario:  sim35  
hyperparam  1 / 1 
Best valid mse: 56.37503  
optimal ntree: 326  
Training time: 7.446382  sec. 
Best for scenario:  sim36  
hyperparam  1 / 1 
Best valid mse: 58.08121  
optimal ntree: 319  
Training time: 10.3268  sec. 
Best for scenario:  sim37  
hyperparam  1 / 1 
Best valid mse: 57.26546  
optimal ntree: 365  
Training time: 8.931815  sec. 
Best for scenario:  sim38  
hyperparam  1 / 1 
Best valid mse: 62.57356  
optimal ntree: 476  
Training time: 7.044137  sec. 
Best for scenario:  sim39  
hyperparam  1 / 1 
Best valid mse: 53.1029  
optimal ntree: 329  
Training time: 4.634873  sec. 
Best for scenario:  sim40  
hyperparam  1 / 1 
Best valid mse: 51.55378  
optimal ntree: 467  
Training time: 6.005701  sec. 
Best for scenario:  sim41  
hyperparam  1 / 1 
Best valid mse: 55.37315  
optimal ntree: 342  
Training time: 6.698183  sec. 
Best for scenario:  sim42  
hyperparam  1 / 1 
Best valid mse: 60.49936  
optimal ntree: 348  
Training time: 6.521605  sec. 
Best for scenario:  sim43  
hyperparam  1 / 1 
Best valid mse: 57.71598  
optimal ntree: 394  
Training time: 6.870026  sec. 
Best for scenario:  sim44  
hyperparam  1 / 1 
Best valid mse: 57.86499  
optimal ntree: 500  
Training time: 12.22347  sec. 
Best for scenario:  sim45  
hyperparam  1 / 1 
Best valid mse: 63.5664  
optimal ntree: 289  
Training time: 9.950408  sec. 
Best for scenario:  sim46  
hyperparam  1 / 1 
Best valid mse: 56.48859  
optimal ntree: 495  
Training time: 12.29663  sec. 
Best for scenario:  sim47  
hyperparam  1 / 1 
Best valid mse: 50.33826  
optimal ntree: 392  
Training time: 10.4937  sec. 
Best for scenario:  sim48  
hyperparam  1 / 1 
Best valid mse: 55.73661  
optimal ntree: 420  
Training time: 5.905298  sec. 
Best for scenario:  sim49  
hyperparam  1 / 1 
Best valid mse: 52.00871  
optimal ntree: 410  
Training time: 5.672499  sec. 
Best for scenario:  sim50  
hyperparam  1 / 1 
Best valid mse: 58.18105  
optimal ntree: 326  
Training time: 5.569694  sec. 
Best for scenario:  sim51  
hyperparam  1 / 1 
Best valid mse: 57.87596  
optimal ntree: 356  
Training time: 4.961786  sec. 
Best for scenario:  sim52  
hyperparam  1 / 1 
Best valid mse: 54.38487  
optimal ntree: 328  
Training time: 4.48847  sec. 
Best for scenario:  sim53  
hyperparam  1 / 1 
Best valid mse: 55.59778  
optimal ntree: 405  
Training time: 5.572506  sec. 
Best for scenario:  sim54  
hyperparam  1 / 1 
Best valid mse: 58.19052  
optimal ntree: 278  
Training time: 4.810443  sec. 
Best for scenario:  sim55  
hyperparam  1 / 1 
Best valid mse: 55.27643  
optimal ntree: 302  
Training time: 6.89241  sec. 
Best for scenario:  sim56  
hyperparam  1 / 1 
Best valid mse: 58.08161  
optimal ntree: 375  
Training time: 8.688806  sec. 
Best for scenario:  sim57  
hyperparam  1 / 1 
Best valid mse: 59.24893  
optimal ntree: 318  
Training time: 8.887513  sec. 
Best for scenario:  sim58  
hyperparam  1 / 1 
Best valid mse: 54.62439  
optimal ntree: 425  
Training time: 8.925592  sec. 
Best for scenario:  sim59  
hyperparam  1 / 1 
Best valid mse: 55.11093  
optimal ntree: 412  
Training time: 5.256221  sec. 
Best for scenario:  sim60  
hyperparam  1 / 1 
Best valid mse: 48.16865  
optimal ntree: 427  
Training time: 5.383948  sec. 
Best for scenario:  sim61  
hyperparam  1 / 1 
Best valid mse: 57.21002  
optimal ntree: 346  
Training time: 5.013891  sec. 
Best for scenario:  sim62  
hyperparam  1 / 1 
Best valid mse: 60.13329  
optimal ntree: 380  
Training time: 7.231489  sec. 
Best for scenario:  sim63  
hyperparam  1 / 1 
Best valid mse: 56.2376  
optimal ntree: 556  
Training time: 10.70739  sec. 
Best for scenario:  sim64  
hyperparam  1 / 1 
Best valid mse: 51.83172  
optimal ntree: 337  
Training time: 6.335815  sec. 
Best for scenario:  sim65  
hyperparam  1 / 1 
Best valid mse: 52.56205  
optimal ntree: 297  
Training time: 6.206158  sec. 
Best for scenario:  sim66  
hyperparam  1 / 1 
Best valid mse: 50.66015  
optimal ntree: 360  
Training time: 6.040714  sec. 
Best for scenario:  sim67  
hyperparam  1 / 1 
Best valid mse: 58.34078  
optimal ntree: 404  
Training time: 6.039014  sec. 
Best for scenario:  sim68  
hyperparam  1 / 1 
Best valid mse: 50.88699  
optimal ntree: 356  
Training time: 5.638683  sec. 
Best for scenario:  sim69  
hyperparam  1 / 1 
Best valid mse: 57.83843  
optimal ntree: 289  
Training time: 5.540577  sec. 
Best for scenario:  sim70  
hyperparam  1 / 1 
Best valid mse: 51.0225  
optimal ntree: 448  
Training time: 9.525315  sec. 
Best for scenario:  sim71  
hyperparam  1 / 1 
Best valid mse: 59.35749  
optimal ntree: 298  
Training time: 5.946549  sec. 
Best for scenario:  sim72  
hyperparam  1 / 1 
Best valid mse: 46.2399  
optimal ntree: 329  
Training time: 6.110875  sec. 
Best for scenario:  sim73  
hyperparam  1 / 1 
Best valid mse: 57.15323  
optimal ntree: 325  
Training time: 7.69956  sec. 
Best for scenario:  sim74  
hyperparam  1 / 1 
Best valid mse: 55.37434  
optimal ntree: 543  
Training time: 7.794781  sec. 
Best for scenario:  sim75  
hyperparam  1 / 1 
Best valid mse: 63.03636  
optimal ntree: 318  
Training time: 5.923949  sec. 
Best for scenario:  sim76  
hyperparam  1 / 1 
Best valid mse: 57.75401  
optimal ntree: 379  
Training time: 6.053141  sec. 
Best for scenario:  sim77  
hyperparam  1 / 1 
Best valid mse: 51.85789  
optimal ntree: 371  
Training time: 5.729835  sec. 
Best for scenario:  sim78  
hyperparam  1 / 1 
Best valid mse: 51.92272  
optimal ntree: 309  
Training time: 5.394738  sec. 
Best for scenario:  sim79  
hyperparam  1 / 1 
Best valid mse: 66.18301  
optimal ntree: 411  
Training time: 5.592659  sec. 
Best for scenario:  sim80  
hyperparam  1 / 1 
Best valid mse: 57.37347  
optimal ntree: 343  
Training time: 6.137712  sec. 
Best for scenario:  sim81  
hyperparam  1 / 1 
Best valid mse: 54.39331  
optimal ntree: 424  
Training time: 6.794946  sec. 
Best for scenario:  sim82  
hyperparam  1 / 1 
Best valid mse: 53.42603  
optimal ntree: 323  
Training time: 5.482156  sec. 
Best for scenario:  sim83  
hyperparam  1 / 1 
Best valid mse: 60.80437  
optimal ntree: 801  
Training time: 10.21739  sec. 
Best for scenario:  sim84  
hyperparam  1 / 1 
Best valid mse: 51.3683  
optimal ntree: 369  
Training time: 6.017342  sec. 
Best for scenario:  sim85  
hyperparam  1 / 1 
Best valid mse: 57.94811  
optimal ntree: 419  
Training time: 5.108502  sec. 
Best for scenario:  sim86  
hyperparam  1 / 1 
Best valid mse: 53.74018  
optimal ntree: 359  
Training time: 5.03959  sec. 
Best for scenario:  sim87  
hyperparam  1 / 1 
Best valid mse: 55.72406  
optimal ntree: 412  
Training time: 6.055299  sec. 
Best for scenario:  sim88  
hyperparam  1 / 1 
Best valid mse: 57.87405  
optimal ntree: 389  
Training time: 7.410019  sec. 
Best for scenario:  sim89  
hyperparam  1 / 1 
Best valid mse: 48.46115  
optimal ntree: 528  
Training time: 7.8749  sec. 
Best for scenario:  sim90  
hyperparam  1 / 1 
Best valid mse: 54.49265  
optimal ntree: 266  
Training time: 4.634579  sec. 
Best for scenario:  sim91  
hyperparam  1 / 1 
Best valid mse: 58.01591  
optimal ntree: 273  
Training time: 4.284013  sec. 
Best for scenario:  sim92  
hyperparam  1 / 1 
Best valid mse: 50.60337  
optimal ntree: 296  
Training time: 4.972096  sec. 
Best for scenario:  sim93  
hyperparam  1 / 1 
Best valid mse: 46.72324  
optimal ntree: 483  
Training time: 5.954047  sec. 
Best for scenario:  sim94  
hyperparam  1 / 1 
Best valid mse: 64.95969  
optimal ntree: 341  
Training time: 5.336904  sec. 
Best for scenario:  sim95  
hyperparam  1 / 1 
Best valid mse: 59.94547  
optimal ntree: 405  
Training time: 6.846918  sec. 
Best for scenario:  sim96  
hyperparam  1 / 1 
Best valid mse: 56.77892  
optimal ntree: 353  
Training time: 6.398632  sec. 
Best for scenario:  sim97  
hyperparam  1 / 1 
Best valid mse: 57.2974  
optimal ntree: 428  
Training time: 6.754522  sec. 
Best for scenario:  sim98  
hyperparam  1 / 1 
Best valid mse: 55.90567  
optimal ntree: 283  
Training time: 5.207212  sec. 
Best for scenario:  sim99  
hyperparam  1 / 1 
Best valid mse: 53.97734  
optimal ntree: 275  
Training time: 5.205323  sec. 
Best for scenario:  sim100  
hyperparam  1 / 1 
Best valid mse: 51.16277  
optimal ntree: 297  
Training time: 4.382574  sec. 

2.2 Unaware premium

The unaware price, \(\mu^U(\mathbf{x})\), excludes direct use of \(D\). It is defined as the best predictor of \(\widehat{\mu}^B(\mathbf{X}, D)\) given \(\mathbf{x}\):
\[\begin{equation*} \widehat{\mu}^U(\mathbf{x}) = E\{\widehat{\mu}^B(\mathbf{x}, D) | X = \mathbf{x}\}. \end{equation*}\]
This maintains consistency with the best estimate premium, whether \(\widehat{\mu}^B\) is data-driven or based on indicated rates (Complement 5 of the main paper). One can use a regression model to predict \(\widehat{\mu}^B(\mathbf{X}, D)\) based on \(\mathbf{X}\). The loss function defaults to mean squared error as it is a reasonable distributional assumption for this response variable.

Alternatively (what we do), one can estimate the propensity and explicitly weight the best-estimate premiums based on predicted propensity. This also keeps consistency with the best-estimate.

Estimating the propensity function
source('___lgb_pdx_estimate.R') 

## Remove stale propensity predictions before refitting
unlink(file.path('preds', "*_pdx.json"))

## Hyperparameter grid for the propensity models (single combination)
hyperparameter_grid <- expand.grid(
  learning_rate = c(0.01),
  feature_fraction = c(0.75),
  bagging_fraction = c(0.75),
  max_depth = c(5),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## Fit one propensity (P(D|X)) lightgbm per scenario, keyed by scenario name
pdx_lgb <- lapply(setNames(nm = names(sims)), function(scenario){
  the_pdx_lightgbm_fun(
    list_data = list('train' = sims[[scenario]],
                     'valid' = valid[[scenario]],
                     'test' = test[[scenario]]),
    name = scenario,
    hyperparameter_grid = hyperparameter_grid
  )
})
Pdx for scenario:  Scenario1  
hyperparam  1 / 1 
Best valid binary logloss: 0.5838302  
optimal ntree: 352  
Training time: 8.886383  sec. 
Pdx for scenario:  Scenario2  
hyperparam  1 / 1 
Best valid binary logloss: 0.5838944  
optimal ntree: 362  
Training time: 9.145058  sec. 
Pdx for scenario:  Scenario3  
hyperparam  1 / 1 
Best valid binary logloss: 0.6359648  
optimal ntree: 450  
Training time: 10.83152  sec. 
Training the propensity model on experimental data (for later use in section 5)
## Hyperparameter grid for the simulation-study propensity fits
hyperparameter_grid_sims <- expand.grid(
  learning_rate = c(0.01),
  bagging_fraction = c(0.75),
  max_depth = c(6),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## Fit one propensity model per simulated sample (used in section 5)
pdx_sims <- lapply(seq_along(sim_samples), function(i){
  sample_i <- sim_samples[[i]]
  the_pdx_lightgbm_fun(
    list_data = list('train' = sample_i$train,
                     'valid' = sample_i$valid,
                     'test' = sample_i$test),
    name = paste0('sim', i),
    hyperparameter_grid = hyperparameter_grid_sims
  )
})
Pdx for scenario:  sim1  
hyperparam  1 / 1 
Best valid binary logloss: 0.6125992  
optimal ntree: 234  
Training time: 4.283713  sec. 
Pdx for scenario:  sim2  
hyperparam  1 / 1 
Best valid binary logloss: 0.5945881  
optimal ntree: 149  
Training time: 2.984536  sec. 
Pdx for scenario:  sim3  
hyperparam  1 / 1 
Best valid binary logloss: 0.6199096  
optimal ntree: 133  
Training time: 2.916893  sec. 
Pdx for scenario:  sim4  
hyperparam  1 / 1 
Best valid binary logloss: 0.5903535  
optimal ntree: 279  
Training time: 4.565055  sec. 
Pdx for scenario:  sim5  
hyperparam  1 / 1 
Best valid binary logloss: 0.6080146  
optimal ntree: 201  
Training time: 3.885632  sec. 
Pdx for scenario:  sim6  
hyperparam  1 / 1 
Best valid binary logloss: 0.5874264  
optimal ntree: 231  
Training time: 4.689778  sec. 
Pdx for scenario:  sim7  
hyperparam  1 / 1 
Best valid binary logloss: 0.6084615  
optimal ntree: 182  
Training time: 2.749297  sec. 
Pdx for scenario:  sim8  
hyperparam  1 / 1 
Best valid binary logloss: 0.5675226  
optimal ntree: 313  
Training time: 4.103354  sec. 
Pdx for scenario:  sim9  
hyperparam  1 / 1 
Best valid binary logloss: 0.5985643  
optimal ntree: 165  
Training time: 3.40891  sec. 
Pdx for scenario:  sim10  
hyperparam  1 / 1 
Best valid binary logloss: 0.6250057  
optimal ntree: 166  
Training time: 3.406135  sec. 
Pdx for scenario:  sim11  
hyperparam  1 / 1 
Best valid binary logloss: 0.6203973  
optimal ntree: 147  
Training time: 3.914401  sec. 
Pdx for scenario:  sim12  
hyperparam  1 / 1 
Best valid binary logloss: 0.602498  
optimal ntree: 181  
Training time: 4.711671  sec. 
Pdx for scenario:  sim13  
hyperparam  1 / 1 
Best valid binary logloss: 0.6051516  
optimal ntree: 180  
Training time: 5.021962  sec. 
Pdx for scenario:  sim14  
hyperparam  1 / 1 
Best valid binary logloss: 0.6089953  
optimal ntree: 145  
Training time: 3.94535  sec. 
Pdx for scenario:  sim15  
hyperparam  1 / 1 
Best valid binary logloss: 0.6193706  
optimal ntree: 162  
Training time: 5.093523  sec. 
Pdx for scenario:  sim16  
hyperparam  1 / 1 
Best valid binary logloss: 0.6049338  
optimal ntree: 315  
Training time: 6.677736  sec. 
Pdx for scenario:  sim17  
hyperparam  1 / 1 
Best valid binary logloss: 0.5883746  
optimal ntree: 181  
Training time: 3.876408  sec. 
Pdx for scenario:  sim18  
hyperparam  1 / 1 
Best valid binary logloss: 0.5925265  
optimal ntree: 274  
Training time: 5.048811  sec. 
Pdx for scenario:  sim19  
hyperparam  1 / 1 
Best valid binary logloss: 0.6321457  
optimal ntree: 130  
Training time: 3.309675  sec. 
Pdx for scenario:  sim20  
hyperparam  1 / 1 
Best valid binary logloss: 0.610014  
optimal ntree: 160  
Training time: 3.906293  sec. 
Pdx for scenario:  sim21  
hyperparam  1 / 1 
Best valid binary logloss: 0.5374162  
optimal ntree: 283  
Training time: 5.703081  sec. 
Pdx for scenario:  sim22  
hyperparam  1 / 1 
Best valid binary logloss: 0.5541833  
optimal ntree: 330  
Training time: 7.742988  sec. 
Pdx for scenario:  sim23  
hyperparam  1 / 1 
Best valid binary logloss: 0.5855573  
optimal ntree: 183  
Training time: 4.49776  sec. 
Pdx for scenario:  sim24  
hyperparam  1 / 1 
Best valid binary logloss: 0.5868012  
optimal ntree: 708  
Training time: 10.92581  sec. 
Pdx for scenario:  sim25  
hyperparam  1 / 1 
Best valid binary logloss: 0.5917567  
optimal ntree: 272  
Training time: 6.360571  sec. 
Pdx for scenario:  sim26  
hyperparam  1 / 1 
Best valid binary logloss: 0.5749583  
optimal ntree: 369  
Training time: 6.684613  sec. 
Pdx for scenario:  sim27  
hyperparam  1 / 1 
Best valid binary logloss: 0.5864498  
optimal ntree: 256  
Training time: 6.574841  sec. 
Pdx for scenario:  sim28  
hyperparam  1 / 1 
Best valid binary logloss: 0.6244549  
optimal ntree: 139  
Training time: 2.745347  sec. 
Pdx for scenario:  sim29  
hyperparam  1 / 1 
Best valid binary logloss: 0.5573078  
optimal ntree: 305  
Training time: 4.704959  sec. 
Pdx for scenario:  sim30  
hyperparam  1 / 1 
Best valid binary logloss: 0.6326402  
optimal ntree: 154  
Training time: 3.76924  sec. 
Pdx for scenario:  sim31  
hyperparam  1 / 1 
Best valid binary logloss: 0.5655367  
optimal ntree: 250  
Training time: 7.398465  sec. 
Pdx for scenario:  sim32  
hyperparam  1 / 1 
Best valid binary logloss: 0.5740446  
optimal ntree: 275  
Training time: 8.241881  sec. 
Pdx for scenario:  sim33  
hyperparam  1 / 1 
Best valid binary logloss: 0.600118  
optimal ntree: 166  
Training time: 4.82768  sec. 
Pdx for scenario:  sim34  
hyperparam  1 / 1 
Best valid binary logloss: 0.5763523  
optimal ntree: 342  
Training time: 5.177281  sec. 
Pdx for scenario:  sim35  
hyperparam  1 / 1 
Best valid binary logloss: 0.6369151  
optimal ntree: 141  
Training time: 4.642108  sec. 
Pdx for scenario:  sim36  
hyperparam  1 / 1 
Best valid binary logloss: 0.6102131  
optimal ntree: 168  
Training time: 3.434645  sec. 
Pdx for scenario:  sim37  
hyperparam  1 / 1 
Best valid binary logloss: 0.6188351  
optimal ntree: 162  
Training time: 2.773263  sec. 
Pdx for scenario:  sim38  
hyperparam  1 / 1 
Best valid binary logloss: 0.6131489  
optimal ntree: 215  
Training time: 3.730805  sec. 
Pdx for scenario:  sim39  
hyperparam  1 / 1 
Best valid binary logloss: 0.608278  
optimal ntree: 167  
Training time: 2.981708  sec. 
Pdx for scenario:  sim40  
hyperparam  1 / 1 
Best valid binary logloss: 0.5977083  
optimal ntree: 187  
Training time: 4.174678  sec. 
Pdx for scenario:  sim41  
hyperparam  1 / 1 
Best valid binary logloss: 0.5711532  
optimal ntree: 204  
Training time: 3.737719  sec. 
Pdx for scenario:  sim42  
hyperparam  1 / 1 
Best valid binary logloss: 0.5718066  
optimal ntree: 296  
Training time: 4.667447  sec. 
Pdx for scenario:  sim43  
hyperparam  1 / 1 
Best valid binary logloss: 0.5912995  
optimal ntree: 212  
Training time: 3.737456  sec. 
Pdx for scenario:  sim44  
hyperparam  1 / 1 
Best valid binary logloss: 0.59666  
optimal ntree: 233  
Training time: 3.544378  sec. 
Pdx for scenario:  sim45  
hyperparam  1 / 1 
Best valid binary logloss: 0.585594  
optimal ntree: 304  
Training time: 5.073228  sec. 
Pdx for scenario:  sim46  
hyperparam  1 / 1 
Best valid binary logloss: 0.6001527  
optimal ntree: 215  
Training time: 3.242515  sec. 
Pdx for scenario:  sim47  
hyperparam  1 / 1 
Best valid binary logloss: 0.5785127  
optimal ntree: 170  
Training time: 2.710515  sec. 
Pdx for scenario:  sim48  
hyperparam  1 / 1 
Best valid binary logloss: 0.6086695  
optimal ntree: 164  
Training time: 2.657152  sec. 
Pdx for scenario:  sim49  
hyperparam  1 / 1 
Best valid binary logloss: 0.5867867  
optimal ntree: 395  
Training time: 5.434568  sec. 
Pdx for scenario:  sim50  
hyperparam  1 / 1 
Best valid binary logloss: 0.6008102  
optimal ntree: 255  
Training time: 3.898632  sec. 
Pdx for scenario:  sim51  
hyperparam  1 / 1 
Best valid binary logloss: 0.5603334  
optimal ntree: 278  
Training time: 3.941241  sec. 
Pdx for scenario:  sim52  
hyperparam  1 / 1 
Best valid binary logloss: 0.616842  
optimal ntree: 204  
Training time: 3.592224  sec. 
Pdx for scenario:  sim53  
hyperparam  1 / 1 
Best valid binary logloss: 0.6179536  
optimal ntree: 199  
Training time: 4.496385  sec. 
Pdx for scenario:  sim54  
hyperparam  1 / 1 
Best valid binary logloss: 0.5877936  
optimal ntree: 266  
Training time: 5.045094  sec. 
Pdx for scenario:  sim55  
hyperparam  1 / 1 
Best valid binary logloss: 0.635274  
optimal ntree: 128  
Training time: 3.210545  sec. 
Pdx for scenario:  sim56  
hyperparam  1 / 1 
Best valid binary logloss: 0.6162543  
optimal ntree: 148  
Training time: 4.134681  sec. 
Pdx for scenario:  sim57  
hyperparam  1 / 1 
Best valid binary logloss: 0.5606306  
optimal ntree: 297  
Training time: 4.938671  sec. 
Pdx for scenario:  sim58  
hyperparam  1 / 1 
Best valid binary logloss: 0.6015666  
optimal ntree: 195  
Training time: 4.544127  sec. 
Pdx for scenario:  sim59  
hyperparam  1 / 1 
Best valid binary logloss: 0.6238633  
optimal ntree: 108  
Training time: 2.791945  sec. 
Pdx for scenario:  sim60  
hyperparam  1 / 1 
Best valid binary logloss: 0.5725068  
optimal ntree: 200  
Training time: 2.875574  sec. 
Pdx for scenario:  sim61  
hyperparam  1 / 1 
Best valid binary logloss: 0.5670976  
optimal ntree: 328  
Training time: 4.243049  sec. 
Pdx for scenario:  sim62  
hyperparam  1 / 1 
Best valid binary logloss: 0.5462151  
optimal ntree: 570  
Training time: 5.741335  sec. 
Pdx for scenario:  sim63  
hyperparam  1 / 1 
Best valid binary logloss: 0.6087519  
optimal ntree: 171  
Training time: 2.675122  sec. 
Pdx for scenario:  sim64  
hyperparam  1 / 1 
Best valid binary logloss: 0.5944739  
optimal ntree: 178  
Training time: 3.548949  sec. 
Pdx for scenario:  sim65  
hyperparam  1 / 1 
Best valid binary logloss: 0.6032495  
optimal ntree: 301  
Training time: 4.257425  sec. 
Pdx for scenario:  sim66  
hyperparam  1 / 1 
Best valid binary logloss: 0.5826107  
optimal ntree: 222  
Training time: 4.133112  sec. 
Pdx for scenario:  sim67  
hyperparam  1 / 1 
Best valid binary logloss: 0.6115318  
optimal ntree: 230  
Training time: 4.162833  sec. 
Pdx for scenario:  sim68  
hyperparam  1 / 1 
Best valid binary logloss: 0.5824181  
optimal ntree: 261  
Training time: 6.632526  sec. 
Pdx for scenario:  sim69  
hyperparam  1 / 1 
Best valid binary logloss: 0.566163  
optimal ntree: 255  
Training time: 7.806729  sec. 
Pdx for scenario:  sim70  
hyperparam  1 / 1 
Best valid binary logloss: 0.5897115  
optimal ntree: 233  
Training time: 5.99527  sec. 
Pdx for scenario:  sim71  
hyperparam  1 / 1 
Best valid binary logloss: 0.5890438  
optimal ntree: 228  
Training time: 5.419102  sec. 
Pdx for scenario:  sim72  
hyperparam  1 / 1 
Best valid binary logloss: 0.5908817  
optimal ntree: 266  
Training time: 4.487267  sec. 
Pdx for scenario:  sim73  
hyperparam  1 / 1 
Best valid binary logloss: 0.6255747  
optimal ntree: 162  
Training time: 3.066394  sec. 
Pdx for scenario:  sim74  
hyperparam  1 / 1 
Best valid binary logloss: 0.625483  
optimal ntree: 141  
Training time: 2.424044  sec. 
Pdx for scenario:  sim75  
hyperparam  1 / 1 
Best valid binary logloss: 0.588276  
optimal ntree: 306  
Training time: 5.081996  sec. 
Pdx for scenario:  sim76  
hyperparam  1 / 1 
Best valid binary logloss: 0.6023314  
optimal ntree: 197  
Training time: 3.645914  sec. 
Pdx for scenario:  sim77  
hyperparam  1 / 1 
Best valid binary logloss: 0.593408  
optimal ntree: 236  
Training time: 3.609692  sec. 
Pdx for scenario:  sim78  
hyperparam  1 / 1 
Best valid binary logloss: 0.5972408  
optimal ntree: 549  
Training time: 9.869183  sec. 
Pdx for scenario:  sim79  
hyperparam  1 / 1 
Best valid binary logloss: 0.6015623  
optimal ntree: 202  
Training time: 5.905635  sec. 
Pdx for scenario:  sim80  
hyperparam  1 / 1 
Best valid binary logloss: 0.5671507  
optimal ntree: 342  
Training time: 8.580798  sec. 
Pdx for scenario:  sim81  
hyperparam  1 / 1 
Best valid binary logloss: 0.6202292  
optimal ntree: 200  
Training time: 8.453523  sec. 
Pdx for scenario:  sim82  
hyperparam  1 / 1 
Best valid binary logloss: 0.6069936  
optimal ntree: 196  
Training time: 6.560467  sec. 
Pdx for scenario:  sim83  
hyperparam  1 / 1 
Best valid binary logloss: 0.6042192  
optimal ntree: 155  
Training time: 4.115472  sec. 
Pdx for scenario:  sim84  
hyperparam  1 / 1 
Best valid binary logloss: 0.5875181  
optimal ntree: 381  
Training time: 7.009474  sec. 
Pdx for scenario:  sim85  
hyperparam  1 / 1 
Best valid binary logloss: 0.6032935  
optimal ntree: 244  
Training time: 7.762281  sec. 
Pdx for scenario:  sim86  
hyperparam  1 / 1 
Best valid binary logloss: 0.6042148  
optimal ntree: 188  
Training time: 6.839476  sec. 
Pdx for scenario:  sim87  
hyperparam  1 / 1 
Best valid binary logloss: 0.5850288  
optimal ntree: 191  
Training time: 6.2971  sec. 
Pdx for scenario:  sim88  
hyperparam  1 / 1 
Best valid binary logloss: 0.5862054  
optimal ntree: 249  
Training time: 6.454475  sec. 
Pdx for scenario:  sim89  
hyperparam  1 / 1 
Best valid binary logloss: 0.6066853  
optimal ntree: 164  
Training time: 4.963233  sec. 
Pdx for scenario:  sim90  
hyperparam  1 / 1 
Best valid binary logloss: 0.5803761  
optimal ntree: 318  
Training time: 7.103147  sec. 
Pdx for scenario:  sim91  
hyperparam  1 / 1 
Best valid binary logloss: 0.5886844  
optimal ntree: 398  
Training time: 8.568073  sec. 
Pdx for scenario:  sim92  
hyperparam  1 / 1 
Best valid binary logloss: 0.5597474  
optimal ntree: 382  
Training time: 10.72598  sec. 
Pdx for scenario:  sim93  
hyperparam  1 / 1 
Best valid binary logloss: 0.6097617  
optimal ntree: 171  
Training time: 7.150694  sec. 
Pdx for scenario:  sim94  
hyperparam  1 / 1 
Best valid binary logloss: 0.5863819  
optimal ntree: 279  
Training time: 10.80254  sec. 
Pdx for scenario:  sim95  
hyperparam  1 / 1 
Best valid binary logloss: 0.580913  
optimal ntree: 366  
Training time: 8.922859  sec. 
Pdx for scenario:  sim96  
hyperparam  1 / 1 
Best valid binary logloss: 0.6096536  
optimal ntree: 152  
Training time: 5.485705  sec. 
Pdx for scenario:  sim97  
hyperparam  1 / 1 
Best valid binary logloss: 0.604167  
optimal ntree: 177  
Training time: 5.857431  sec. 
Pdx for scenario:  sim98  
hyperparam  1 / 1 
Best valid binary logloss: 0.6182569  
optimal ntree: 163  
Training time: 5.746379  sec. 
Pdx for scenario:  sim99  
hyperparam  1 / 1 
Best valid binary logloss: 0.5787933  
optimal ntree: 253  
Training time: 7.280911  sec. 
Pdx for scenario:  sim100  
hyperparam  1 / 1 
Best valid binary logloss: 0.6286105  
optimal ntree: 162  
Training time: 5.731817  sec. 

2.3 Aware premium

The aware premium, \(\widehat{\mu}^A(\mathbf{x})\), requires knowledge of the marginal distribution of \(D\). The aware price, a particular case of the ``discrimination-free’’ price of Lindholm et al. (2022), is computed as:
\[\begin{equation*} \widehat{\mu}^A(\mathbf{x}) = \sum_{d\in \mathcal{D}} \widehat{\mu}^B(\mathbf{x}, d) \widehat{\Pr}(D = d). \end{equation*}\] As discussed earlier, we assume \(D\) is discrete or discretized. If the training sample is representative of the target population (portfolio, market, or region), the empirical proportions are consistent estimators of \(\Pr(D = d)\). This is also an estimator suggested in Section 5 of Lindholm et al. (2022).

Computing the empirical proportions for protected subpopulations
## Empirical distribution of the protected attribute D, one table per scenario
marg_dist <- lapply(sims, function(scenario_data) {
  prop.table(table(scenario_data$D))
})

## for later use
saveRDS(marg_dist, 'preds/the_empirical_proportions.rds')

## Same empirical proportions for the protected subpopulations of the
## experimental training samples (for later use in section 5)
marg_dist_sims <- lapply(seq_along(sim_samples), function(idx) {
  prop.table(table(sim_samples[[idx]]$train$D))
})
Note

This method generalizes the use of \(D\) as a control variable to prevent distortions in estimated effect of \(\mathbf{X}\) due to omitted variable bias. It applies to all model types and has a causal interpretation under the assumptions that: (i) possible values of \(\mathbf{X}\) are observed across all groups of \(D\) (no extrapolation), (ii) \(D\) causes both \(\mathbf{X}\) and \(Y\), and (iii) there are no relevant unobserved variables. Under the same assumptions, other methods are estimators of the aware premium. One example is inverse probability weighting, which re-weights observations to (artificially) remove the association between \(\mathbf{X}\) and \(D\), preventing proxy effects.

2.4 Corrective premium

We estimate the corrective premium, \(\widehat{\mu}^C(\mathbf{x}, d)\), designed to enforce strong demographic parity by aligning premium distributions across protected groups. Various methods can achieve this, but we recommend the optimal transport approach of Hu, Ratz, and Charpentier (2024). Its advantage lies in modifying \(\widehat{\mu}^B(\mathbf{x}, d)\) while keeping the overall spectrum estimation anchored to the initial best-estimate premium. As an incremental adjustment, it minimizes modeling complexity while enforcing demographic parity, yielding a simple estimator for the corrective premium.

The corrective premium is computed by training a transport function that shifts best-estimate premiums for each protected group toward a common barycenter, ensuring demographic parity through corrective direct discrimination. See Hu, Ratz, and Charpentier (2024) for details. This optimal transport method is implemented in Python via the Equipy package of Fernandes Machado et al. (2025). Listing 2.1 illustrates how this method, despite its complexity, translates into concise Python code.

Listing 2.1: Illustrative Python code for Wasserstein transport toward corrective premiums.
import numpy as np
import pandas as pd
from equipy.fairness import FairWasserstein

# Load best-estimate premiums and associated sensitive attributes
best_est_train = pd.read_csv('best_est_prem_train.csv')  # Training premiums
sens_train = pd.read_csv('sensitive_train.csv')  # discrete sensitive data

# Train Fair Wasserstein transport to adjust premiums
# (fit on the *training* premiums and sensitive attributes loaded above;
# the original listing referenced undefined `best_est` / `sens`)
barycenter = FairWasserstein()
barycenter.fit(best_est_train.values, sens_train.values)

# Load best-estimate premiums and associated sensitive attributes
best_est_test = pd.read_csv('best_est_prem_test.csv')  # Test premiums
sens_test = pd.read_csv('sensitive_test.csv')  # discrete sensitive data

# Apply transformation to obtain fairness-adjusted premiums
corrective_premiums_test = barycenter.transform(best_est_test.values, sens_test.values)

# Save results (the transformed *test* premiums)
pd.DataFrame(corrective_premiums_test).to_csv('corr_prem_test.csv', index=False)
Optimal transport and Wasserstein distance: technical details

[TODO: technical details on optimal transport and the Wasserstein distance to be added here.]

Computing the optimal transport mapping for the scenarios
## Run the Python optimal-transport script (through reticulate); it computes
## the transported (corrective) premiums for each scenario.
source_python("___opt_transp.py")

## now, the folder 'transported' exists, with one epsilon_y file per scenario

To predict the corrective premium on other datasets, one can alternatively train a lightgbm model that maps \((X, D)\) to the corrective premiums produced by Equipy on the training sample.

Computing the lightgbm mapping to corrective premiums for the scenarios
## Helper fitting a lightgbm mapping from (X, D) to the corrective premium
source('___lgb_mapping_to_corr.R')

## One fitted mapping per scenario, stored in a named list
corr_mapping_lgb <- lapply(
  setNames(nm = names(sims)),
  the_mapping_to_corr_lightgbm_fun
)
Mapping to corr. for scenario:  Scenario1  
Data import: 0.3271101  sec. 
Data conversion: 0.01284289  sec. 
Lgb training : 29.53605  sec. 
Mapping to corr. for scenario:  Scenario2  
Data import: 0.2800891  sec. 
Data conversion: 0.009654999  sec. 
Lgb training : 8.655305  sec. 
Mapping to corr. for scenario:  Scenario3  
Data import: 0.249043  sec. 
Data conversion: 0.03339386  sec. 
Lgb training : 15.7132  sec. 

2.5 Hyperaware premium

The hyperaware premium, \(\widehat{\mu}^H(\mathbf{x})\), is the best non-directly discriminatory approximation of the corrective premium. We construct a lightgbm using only \(\mathbf{X}\) to predict the corrective premium, aiming at demographic parity without direct reliance on \(D\):
\[\begin{equation*} \widehat{\mu}^H(\mathbf{x}) = E\{\widehat{\mu}^C(\mathbf{x}, D) | X = \mathbf{x}\}. \end{equation*}\]
Since the response variable is a premium, MSE is a correct choice of loss function in most cases. Just as for the unaware premium, one can explicitly aggregate corrective premiums across protected groups using estimated propensities as weights. This is what we do here.

2.6 Visualizing the estimated spectrum

By combining all the trained models as components into a trained spectrum of fairness, we can define the premium function that will serve to predict the premiums.

Defining the estimated premium and metrics functions
## Assemble, per scenario, the five premium functions (mu_B ... mu_C) from
## their fitted components, then the matching quantile functions.
premiums <- setNames(nm = names(sims)) %>% lapply(function(name){
  premium_generator(best = best_lgb[[name]]$pred_fun,
                    pdx = pdx_lgb[[name]]$pred_fun,
                    maps_to_corr = corr_mapping_lgb[[name]],
                    marg = marg_dist[[name]])
})

quants <- setNames(nm = names(sims)) %>% lapply(function(name){
  quant_generator(premiums = premiums[[name]])
})


## For experimental data (future use in section 5).
## Note: argument is spelled out as `object` (the original relied on
## partial matching via `obj`).
premiums_sims <- setNames(object = seq_along(sim_samples),
                          nm = names(sim_samples)) %>% lapply(function(idx){
  premium_generator(best = best_sims[[idx]]$pred_fun,
                    pdx = pdx_sims[[idx]]$pred_fun,
                    ## Placeholder: the corrective mapping is never used in
                    ## section 5, which focuses on proxy vulnerability only.
                    maps_to_corr = corr_mapping_lgb[['Scenario1']],
                    marg = marg_dist_sims[[idx]])
})

quants_sims <- setNames(object = seq_along(sim_samples),
                        nm = names(sim_samples)) %>% lapply(function(idx){
  quant_generator(premiums = premiums_sims[[idx]])
})
Computation of estimated premiums and local metrics across the grid
## Estimated premiums and propensities over the visualization grid.
## Results are cached as JSON so the slow computation runs only once;
## delete the file to force re-computation.
df_to_g_file <- "preds/df_to_g.json"

# Check if the JSON file exists
if (file.exists(df_to_g_file)) {
  message(sprintf("[%s] File exists. Reading df_to_g from %s", Sys.time(), df_to_g_file))
  df_to_g <- fromJSON(df_to_g_file)
} else {

  ## Theoretical grid statistics from the first section of the online supplement
  preds_grid_stats_theo <- fromJSON('preds/preds_grid_stats_theo.json')

  df_to_g <- setNames(nm = names(sims)) %>% lapply(function(name) {
    message(sprintf("[%s] Processing: %s", Sys.time(), name))

    local_scenario_df <- preds_grid_stats_theo[[name]]

    # Start time for this scenario
    start_time <- Sys.time()

    # Step 1: Compute the five estimated premiums on the grid
    message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
    premium_results <- setNames(nm = levels_for_premiums) %>%
      sapply(function(s) {
        message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
        premiums[[name]][[s]](
          x1 = local_scenario_df$x1,
          x2 = local_scenario_df$x2,
          d = local_scenario_df$d
        )
      })

    # Step 2: Compute the estimated propensity (PDX)
    message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
    pdx_results <- pdx_lgb[[name]]$pred_fun(
      data.frame(
        X1 = local_scenario_df$x1,
        X2 = local_scenario_df$x2,
        D = local_scenario_df$d
      )
    )

    # Step 3: Combine results (the mislabeled "Step 5" is corrected)
    message(sprintf("[%s] Step 3: Combining results", Sys.time()))
    result <- data.frame(
      local_scenario_df,
      premium_results,
      pdx = pdx_results
    )

    # Log completion time
    end_time <- Sys.time()
    message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))

    return(result)
  })

  # Save the entire df_to_g object to JSON
  message(sprintf("[%s] Saving df_to_g to %s", Sys.time(), df_to_g_file))
  toJSON(df_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(df_to_g_file)
  ## Bug fix: the original called rm(df_to_g_theo), an object that never
  ## exists; the helper loaded above is 'preds_grid_stats_theo'.
  rm(preds_grid_stats_theo)
}

grid_stats_path <- 'preds/preds_grid_stats.json'

# Load the cached grid statistics if present; otherwise append the
# quantile-based local metrics to each scenario's grid and cache the result.
if (file.exists(grid_stats_path)) {
  preds_grid_stats <- fromJSON(grid_stats_path)
} else {
  preds_grid_stats <- lapply(setNames(nm = names(df_to_g)), function(name) {
    grid_df <- df_to_g[[name]]
    quant_cols <- sapply(setNames(nm = levels_for_quants), function(s) {
      quants[[name]][[s]](x1 = grid_df$x1,
                          x2 = grid_df$x2,
                          d = grid_df$d)
    })
    data.frame(grid_df, quant_cols)
  })
  write(toJSON(preds_grid_stats, pretty = TRUE, auto_unbox = TRUE),
        grid_stats_path)
}
Computing estimated premiums and local metrics for the simulations
## Estimated premiums and propensities on the simulated populations
## (train/valid/test splits), cached as JSON.
pop_to_g_file <- "preds/pop_to_g.json"

# Check if the JSON file exists
if (file.exists(pop_to_g_file)) {
  message(sprintf("[%s] File exists. Reading pop_to_g from %s", Sys.time(), pop_to_g_file))
  pop_to_g <- fromJSON(pop_to_g_file)
} else {
  ## Theoretical population statistics from the first section of the supplement
  preds_pop_stats_theo <- fromJSON('preds/preds_pop_stats_theo.json')

  pop_to_g <- setNames(nm = names(sims)) %>% lapply(function(name) {
    message(sprintf("[%s] Processing: %s", Sys.time(), name))

    # Start time for this simulation
    start_time <- Sys.time()

    list_data <- list('train' = sims[[name]],
                      'valid' = valid[[name]],
                      'test' = test[[name]])

    result <- setNames(nm = names(list_data)) %>% lapply(function(nm){

      data <- list_data[[nm]]

      # Step 1: Compute the five estimated premiums
      message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
      premium_results <- setNames(nm = levels_for_premiums) %>%
        sapply(function(s) {
          message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
          premiums[[name]][[s]](
            x1 = data$X1,
            x2 = data$X2,
            d = data$D
          )
        })

      # Step 2: Compute the estimated propensity (PDX)
      message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
      pdx_results <- pdx_lgb[[name]]$pred_fun(
        data.frame(
          X1 = data$X1,
          X2 = data$X2,
          D = data$D
        )
      )

      # Step 3: Combine with the theoretical statistics
      # (the mislabeled "Step 5" is corrected)
      message(sprintf("[%s] Step 3: Combining results", Sys.time()))
      data.frame(
        preds_pop_stats_theo[[name]][[nm]],
        premium_results,
        pdx = pdx_results
      )
    })

    # Log completion time
    end_time <- Sys.time()
    message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))

    return(result)
  })

  # Save the entire pop_to_g object to JSON
  message(sprintf("[%s] Saving pop_to_g to %s", Sys.time(), pop_to_g_file))
  toJSON(pop_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(pop_to_g_file)

  ## Bug fix: the original called rm(pop_to_g_theo), an object that never
  ## exists; the helper loaded above is 'preds_pop_stats_theo'.
  rm(preds_pop_stats_theo)
}


pop_stats_path <- 'preds/preds_pop_stats.json'

# Load cached population statistics if present; otherwise append the
# quantile-based local metrics to each train/valid/test data frame and cache.
if (file.exists(pop_stats_path)) {
  preds_pop_stats <- fromJSON(pop_stats_path)
} else {
  preds_pop_stats <- lapply(setNames(nm = names(sims)), function(name) {
    lapply(setNames(nm = names(pop_to_g[[name]])), function(set) {
      local_df <- pop_to_g[[name]][[set]]
      quant_cols <- sapply(setNames(nm = levels_for_quants), function(s) {
        quants[[name]][[s]](x1 = local_df$X1,
                            x2 = local_df$X2,
                            d = local_df$D)
      })
      data.frame(local_df, quant_cols)
    })
  })
  write(toJSON(preds_pop_stats, pretty = TRUE, auto_unbox = TRUE),
        pop_stats_path)
}
Computing estimated premiums and local metrics for the experiment
## Estimated premiums and propensities for the experimental samples
## (used in section 5), cached as JSON.
sims_to_g_file <- "preds/sims_to_g.json"

# Check if the JSON file exists
if (file.exists(sims_to_g_file)) {
  message(sprintf("[%s] File exists. Reading sims_to_g from %s", Sys.time(), sims_to_g_file))
  sims_to_g <- fromJSON(sims_to_g_file)
} else {
  ## Theoretical statistics from the first section of the online supplement
  preds_sims_stats_theo <- fromJSON('preds/preds_sims_stats_theo.json')

  sims_to_g <- setNames(object = seq_along(sim_samples),
                        nm = names(sim_samples)) %>% lapply(function(idx) {
    message(sprintf("[%s] Processing: %s", Sys.time(), paste0(idx, '/', length(sim_samples))))

    samples_to_ret <- setNames(nm = names(sim_samples[[idx]])) %>% lapply(function(nm){
      data <- preds_sims_stats_theo[[idx]][[nm]]

      # Step 1: Compute the five estimated premiums
      message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
      premium_results <- setNames(nm = levels_for_premiums) %>%
        sapply(function(s) {
          message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
          premiums_sims[[idx]][[s]](
            x1 = data$X1,
            x2 = data$X2,
            d = data$D
          )
        })

      # Step 2: Compute the estimated propensity (PDX)
      message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
      pdx_results <- pdx_sims[[idx]]$pred_fun(
        data.frame(
          X1 = data$X1,
          X2 = data$X2,
          D = data$D
        )
      )

      # Step 3: Combine results — value returned by the inner lapply
      # (the original bound it to an unused `result` variable and
      # mislabeled the log message "Step 5")
      message(sprintf("[%s] Step 3: Combining results", Sys.time()))
      data.frame(
        data,
        premium_results,
        pdx = pdx_results
      )
    })

    return(samples_to_ret)
  })

  # Save the entire sims_to_g object to JSON
  message(sprintf("[%s] Saving sims_to_g to %s", Sys.time(), sims_to_g_file))
  toJSON(sims_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(sims_to_g_file)
}

sims_stats_path <- 'preds/preds_sims_stats.json'

# Load cached experiment statistics if present; otherwise append the
# quantile-based local metrics to each sample's data frame and cache.
if (file.exists(sims_stats_path)) {
  preds_sims_stats <- fromJSON(sims_stats_path)
} else {
  preds_sims_stats <- lapply(setNames(nm = names(sims_to_g)), function(name) {
    lapply(setNames(nm = names(sims_to_g[[name]])), function(set) {
      local_df <- sims_to_g[[name]][[set]]
      quant_cols <- sapply(setNames(nm = levels_for_quants), function(s) {
        quants_sims[[name]][[s]](x1 = local_df$X1,
                                 x2 = local_df$X2,
                                 d = local_df$D)
      })
      data.frame(local_df, quant_cols)
    })
  })
  write(toJSON(preds_sims_stats, pretty = TRUE, auto_unbox = TRUE),
        sims_stats_path)
}
R code producing the estimated propensity illustration across scenarios
## Figure 2.1: estimated propensity P(D = 1 | x1, x2) as a function of x1,
## one panel per scenario, arranged in a single row with a shared legend.
to_save_pdx_perpop <- names(df_to_g) %>% 
  lapply(function(name){
    cols <- the_CAS_colors
    pop_id <- which(names(df_to_g) == name)
    
    ## keep the y axis only on the first (left-most) panel
    if(pop_id == 1){ # first panel: full y axis with label
      the_y_scale <- ylim(0,1)
      the_y_label <- latex2exp::TeX("$\\widehat{P}(D = 1|x_1, x_2)$")
    } else { # other panels: same limits, labels removed
      the_y_scale <- scale_y_continuous(labels = NULL, limits = c(0,1))
      the_y_label <- NULL
    }
    
    the_pops <- c('Scenario 1', 'Scenario 2', 'Scenario 3')
    
    ## lets graph
    df_to_g[[name]] %>% 
      mutate(the_population = name) %>% 
  filter(x1 >= -9, x1 <= 11,
         d == 1) %>% 
  group_by(x1, x2) %>% summarise(pdx = mean(pdx)) %>%  ungroup %>% 
  ggplot(aes(x = x1, y = pdx,
             lty = factor(x2),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(x2),
             color = factor(x2))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '31', '21', '11'), name = latex2exp::TeX('$x_2$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$x_2$')) +
  scale_linewidth_manual(values = c(2, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.65, 0.75, 0.85, 0.9), name = latex2exp::TeX('$x_2$')) + 
  labs(x = latex2exp::TeX("$x_1$"),
       y = the_y_label,
       title = latex2exp::TeX(the_pops[pop_id])) + 
  scale_x_continuous(breaks = c(-3:3)*3 + 1)  + # see above
  theme_minimal() + 
  the_y_scale +
  theme(plot.title = element_text(hjust=0.5))
  }) %>% ggpubr::ggarrange(plotlist = .,
                           nrow = 1,
                           widths = 15, heights = 1,
                           common.legend = TRUE,
                           legend = 'right')

if (!dir.exists('figs')) dir.create('figs')
ggsave(filename = "figs/graph_pdx_perpop.png",
       plot = to_save_pdx_perpop,
       height = 3.25,
       width = 7.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.1: Estimated propensity in terms of \(x_1\) and \(x_2\) for simulations
R code producing the estimated best-estimate, unaware, and aware illustration.
## Figure 2.2: best-estimate, unaware, and aware premiums per scenario.
to_save_premiumsdense_BUA_perpop <- names(df_to_g) %>% lapply(function(pop_name){
  
  ## the colors: darkened first three hues of the Spectral palette
    cols <- RColorBrewer::brewer.pal(5, 'Spectral')[1:3] %>%  colorspace::darken(0.25)
      # the_CAS_colors_full[c(6, 5, 2)]
  pop_id <- which(names(df_to_g) == pop_name)
  
  ## keep the y axis only on the first (left-most) panel
    if(pop_name == head(names(df_to_g), 1)){ # first panel: dollar y labels
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = scales::dollar,
                     limits = c(90, 140))
      the_y_label <- 
  latex2exp::TeX("$\\widehat{\\mu}(x_1, x_2, d)$")
    } else { # other panels: same limits, labels removed
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = NULL,
                     limits = c(90, 140))
      the_y_label <- NULL
    }
  
  ## reshape to long format over mu_B, mu_U, mu_A
  ## (the redundant paste0() wrappers around the character vector are removed)
  to_illustrate <- df_to_g[[pop_name]] %>%
  reshape2::melt(measure.vars = levels_for_premiums[1:3]) %>% 
    mutate(variable = factor(variable,
                             levels = levels_for_premiums[1:3],
                             labels = labels_for_premiums[1:3])) %>% 
  filter(x1 <= 10, x1 >= -8) 
  
  ## mask regions where the estimated propensity is small (< 0.1)
  to_illustrate$value[to_illustrate$pdx < 0.1] <- NA
  
  to_illustrate %>% 
  ggplot(aes(x = x1, y = value,
             lty = factor(d),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(d),
             color = factor(variable))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
  scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) + 
  labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
       title = paste0('Scenario ', pop_id)) +
  scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
  the_y_scale +
  theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
                           ncol = 3,
                           widths = c(18, 15, 15), heights = 1,
                           common.legend = TRUE,
                           legend = 'right')

ggsave(filename = "figs/graph_premiumsdense_BUA_perpop.png",
       plot = to_save_premiumsdense_BUA_perpop,
       height = 4,
       width = 10.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.2: Estimated best-estimate \(\widehat{\mu}^B\), unaware \(\widehat{\mu}^U\), and aware \(\widehat{\mu}^A\) premiums for scenarios 1, 2, and 3 in the Example as a function of \(x_1\), \(x_2\), and \(d\).
R code producing the estimated aware, hyperaware, corrective illustration.
## Figure 2.3: aware, hyperaware, and corrective premiums per scenario.
to_save_premiumsdense_AHC_perpop <- names(df_to_g) %>% lapply(function(pop_name){
  
  ## the colors: darkened last three hues of the Spectral palette
    cols <- RColorBrewer::brewer.pal(5, 'Spectral')[3:5] %>%  colorspace::darken(0.25)
  pop_id <- which(names(df_to_g) == pop_name)
  
  ## keep the y axis only on the first (left-most) panel
    if(pop_name == head(names(df_to_g), 1)){ # first panel: dollar y labels
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = scales::dollar,
                     limits = c(90, 130))
      the_y_label <- 
  latex2exp::TeX("$\\widehat{\\mu}(x_1, x_2, d)$")
    } else { # other panels: same limits, labels removed
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = NULL,
                     limits = c(90, 130))
      the_y_label <- NULL
    }
  
  ## reshape to long format over mu_A, mu_H, mu_C
  ## (the redundant paste0() wrappers around the character vector are removed)
  to_illustrate <- df_to_g[[pop_name]] %>%
  reshape2::melt(measure.vars = levels_for_premiums[3:5]) %>% 
    mutate(variable = factor(variable,
                             levels = levels_for_premiums[3:5],
                             labels = labels_for_premiums[3:5])) %>% 
  filter(x1 <= 10, x1 >= -8) 
  # group_by(x1, d, variable) %>% summarise(value = mean(value)) %>%  ungroup %>% 
  
  ## mask part of the line where the estimated propensity is small (< 0.1)
  to_illustrate$value[to_illustrate$pdx < 0.1] <- NA
  
  to_illustrate %>% 
  ggplot(aes(x = x1, y = value,
             lty = factor(d),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(d),
             color = factor(variable))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
  scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) + 
  labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
       title = paste0('Scenario ', pop_id)) +
  scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
  the_y_scale +
  theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
                           ncol = 3,
                           widths = c(18, 15, 15),
                            heights = 1,
                           common.legend = TRUE,
                           legend = 'right')

ggsave(filename = "figs/graph_premiumsdense_AHC_perpop.png",
       plot = to_save_premiumsdense_AHC_perpop,
       height = 4,
       width = 10.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.3: Estimated aware \(\widehat{\mu}^A\), hyperaware \(\widehat{\mu}^H\), and corrective \(\widehat{\mu}^C\) premiums for scenarios 1, 2, and 3 in the Example as a function of \(x_1\), \(x_2\), and \(d\).